// Files needed in working directory to run
// Modified 5-14 sept 2023 by PR
// Modified Oct 21 2023 by SA

// Metabolic_syndrome_peter-20221130_913.xlsx (starting point)
// WHO population data.dta
// PPI data prior to 2020.csv
// dataset_step4_pr.dta (dataset after the stoplight data merge; cannot be generated from this do file without FV2022.xlsx, therefore this step is commented out)
// localidad departmento.dta (for converting localidad data into department data)
// localidad departamento2.dta (for messed up formatting with Spanish accents)


// The following files will be generated, among others
//
// dataset step5.dta (immediately prior to the PPI merge) (used in the ppi merge)
// dataset step8.dta (dataset that is used for the multiple imputation)
// dataset step9.dta (dataset after multiple imputation)

// Start from an empty dataset in memory.
clear

// ssc install strdate

// Load the raw export. firstrow: the first spreadsheet row supplies the variable names;
// clear: replace whatever data is currently in memory.
import excel "Metabolic_syndrome_peter-20221130_913.xlsx", sheet("Metabolic_syndrome_peter-202211") firstrow clear
// The imported names below contain a garbled character (presumably a mis-encoded
// Spanish accented letter -- TODO confirm against the spreadsheet header). Each rename
// simply removes it so the rest of the do-file can use plain ASCII names.
// Medical history: base answer, its observation date stamp, and overflow columns _1.._9.
rename HistoriaMÃdicaPuente HistoriaMdicaPuente
rename HistoriaMÃdicaPuente_obsDatet HistoriaMdicaPuente_obsDatet
rename HistoriaMÃdicaPuente_1 HistoriaMdicaPuente_1
rename HistoriaMÃdicaPuente_2 HistoriaMdicaPuente_2
rename HistoriaMÃdicaPuente_3 HistoriaMdicaPuente_3
rename HistoriaMÃdicaPuente_4 HistoriaMdicaPuente_4
rename HistoriaMÃdicaPuente_5 HistoriaMdicaPuente_5
rename HistoriaMÃdicaPuente_6 HistoriaMdicaPuente_6
rename HistoriaMÃdicaPuente_7 HistoriaMdicaPuente_7
rename HistoriaMÃdicaPuente_8 HistoriaMdicaPuente_8
rename HistoriaMÃdicaPuente_9 HistoriaMdicaPuente_9
// Systolic blood pressure columns (the numbered _1.._9 renames are disabled below).
rename presiÃnarterialsistÃlica presinarterialsistlica
rename presiÃnarterialsistÃlica_obs presinarterialsistlica_obs
rename presiÃnarterialsistÃlica_enc presinarterialsistlica_enc
/*
rename presiÃnarterialsistÃlica_1 presinarterialsistlica_1
rename presiÃnarterialsistÃlica_2 presinarterialsistlica_2
rename presiÃnarterialsistÃlica_3 presinarterialsistlica_3
rename presiÃnarterialsistÃlica_4 presinarterialsistlica_4
rename presiÃnarterialsistÃlica_5 presinarterialsistlica_5
rename presiÃnarterialsistÃlica_6 presinarterialsistlica_6
rename presiÃnarterialsistÃlica_7 presinarterialsistlica_7
rename presiÃnarterialsistÃlica_8 presinarterialsistlica_8
rename presiÃnarterialsistÃlica_9 presinarterialsistlica_9*/
// Diastolic blood pressure columns (the numbered _1.._9 renames are disabled below).
rename PresiÃnarterialdiastÃlica Presinarterialdiastlica
rename PresiÃnarterialdiastÃlica_ob Presinarterialdiastlica_ob
rename PresiÃnarterialdiastÃlica_en Presinarterialdiastlica_en
/*rename PresiÃnarterialdiastÃlica_1 Presinarterialdiastlica_1
rename PresiÃnarterialdiastÃlica_2 Presinarterialdiastlica_2
rename PresiÃnarterialdiastÃlica_3 Presinarterialdiastlica_3
rename PresiÃnarterialdiastÃlica_4 Presinarterialdiastlica_4
rename PresiÃnarterialdiastÃlica_5 Presinarterialdiastlica_5
rename PresiÃnarterialdiastÃlica_6 Presinarterialdiastlica_6
rename PresiÃnarterialdiastÃlica_7 Presinarterialdiastlica_7
rename PresiÃnarterialdiastÃlica_8 Presinarterialdiastlica_8
rename PresiÃnarterialdiastÃlica_9 Presinarterialdiastlica_9*/

// Family-planning columns; these three variables are dropped again later in this do-file.
rename PuentePlanificaciÃnFamiliar PuentePlanificacinFamiliar
rename PuentePlanificaciÃnFamiliar_o PuentePlanificacinFamiliar_o
rename PuentePlanificaciÃnFamiliar_e PuentePlanificacinFamiliar_e





																		// peter cleaning splv.do //												

/// peter rohloff 28 nov 2022
// cleaning of the Salud para la Vida / Puente de Amistad data set
// cardiometabolic descriptive project

// think about duplicate mrs ids

// --- Medical record number (OldIdentificationNumber) clean-up ---------------------
// OldIdentificationNumber is treated as a STRING throughout this section: every
// comparison below is against a quoted literal, and real() only accepts a string.
// For that reason no destring is attempted until all string-based steps are done
// (a destring that succeeded earlier would make those steps fail with a type mismatch).

// Show duplicated record numbers before any cleaning.
// (A stray ". " Results-window prompt in front of this command was removed.)
duplicates list OldIdentificationNumber

// I suggest dropping records with weird/corrupted medical record numbers - ? test patients
// (inlist() accepts at most 10 string arguments, hence several calls.)
drop if inlist(OldIdentificationNumber, "9q337", "A50220")
drop if inlist(OldIdentificationNumber, "Culum", "JF61022", "YJ200")

// Dates typed into the ID field, plus the placeholder IDs "0" and "1"
drop if inlist(OldIdentificationNumber, "2/9/2018", "5/8/2018", "2/20/2018", "0", "1")

// Same dates with leading blanks (matched exactly, blanks included)
drop if OldIdentificationNumber=="  5/8/2018"
drop if OldIdentificationNumber=="  2/9/2018"
drop if OldIdentificationNumber==" 2/20/2018"

// Flag IDs that cannot be read as a number; real() returns missing for those
// (this includes empty strings).
gen byte notnumeric = real(OldIdentificationNumber)==.

tab notnumeric
list OldIdentificationNumber if notnumeric==1

// Malformed IDs identified from the listing above
drop if OldIdentificationNumber=="18-08-2017- 01"
drop if OldIdentificationNumber=="2575 62133 1416"
drop if OldIdentificationNumber=="48121.."
drop if OldIdentificationNumber=="76440.."
drop if OldIdentificationNumber=="M9112022"

// Re-check duplicates after the clean-up
sort OldIdentificationNumber
duplicates list OldIdentificationNumber

// 4 lines below, OldIdentificationNumber is in string format, added "" and if missing() (9/16, SA)
drop if OldIdentificationNumber == "2" & Edad == 29
drop if OldIdentificationNumber == "9102" & !missing(Localidad)
drop if OldIdentificationNumber == "76130718" & !missing(Localidad)
drop if missing(OldIdentificationNumber)

// Convert to numeric only now, once no string comparison remains. destring leaves the
// variable untouched (and says so) if any non-numeric value is still present.
destring OldIdentificationNumber, replace


// Age validity: sort so the extremes are easy to inspect in the data browser.
sort Edad
// Earlier idea was to keep under-18 records with age set to missing; final decision is to
// drop them. Records whose Edad is system-missing (.) are dropped as well.
drop if Edad < 18 | Edad == .
// Upper age ranges are believable, so no upper cut-off is applied.

// Clean up location orthography: each garbled accented-vowel sequence in Localidad is
// replaced by the plain vowel. The substitutions are applied innermost-first, i.e. in the
// order a, e, u, o, i; the final "." argument means "replace every occurrence".
replace Localidad = subinstr(subinstr(subinstr(subinstr(subinstr(Localidad, "√°", "a", .), "√©", "e", .), "√∫", "u", .), "√≥", "o", .), "√≠", "i", .)


//a note on date stamps - there are many of them - wherever possible I extracted them. An observation date stamp is autogenerated by OpenMRS when the data is filed. Encounter dates, on the other hand, are entered manually. I would mostly go by the observation date. I don't think Encounter date is accurate, both because sometimes it is not entered by the user and because it is the same variable across all the different forms that we extracted, and the data extractor just arbitrarily (or maybe not arbitrarily, but nevertheless) picks only one Encounter date to extract. So Encounter date is probably mostly not trustworthy. Use the observation date stamp for the variable of interest.

// a note on age and date stamps - age is age at the time I extracted this data set!!!!  Have to pay attention to this when doing any age-categorized descriptions: remember to use age when the data was obtained, not age at the time of data extraction (back-calculate using dob).

//note on blood glucose data. The workflow is a random blood glucose. If abnormal, then a second visit is scheduled for fasting blood glucose. Almost no one gets A1C. This is why there are two variables for glucose and multiple columns. Generally another clue is encounter type (adultonuevo vs reconsulta), with nuevo generally being the first visit (random glucose) and reconsulta the second (fasting), but I'm not sure that is completely accurate, as sometimes a step may be skipped for timing reasons and picked up in a second visit. Also, a nurse may have entered two lab values on the same date (random and fasting) if they were logging data at a remote date from the actual encounter.

//medical history - multiple columns here because this is a multiple choice drop down and so there will be as many columns as answers (up to 9 columns in our data set). All this has to be compiled for each patient. There is no consistency on which condition appears in which column

//here I'm cleaning up a bunch of garbage and covid related stuff. 


// The ten medical-history answer columns, listed explicitly so that the date-stamp
// column HistoriaMdicaPuente_obsDatet is not touched.
local histvars HistoriaMdicaPuente HistoriaMdicaPuente_1 HistoriaMdicaPuente_2 HistoriaMdicaPuente_3 HistoriaMdicaPuente_4 HistoriaMdicaPuente_5 HistoriaMdicaPuente_6 HistoriaMdicaPuente_7 HistoriaMdicaPuente_8 HistoriaMdicaPuente_9

foreach h of varlist `histvars' {
	// Garbage and covid-related answers are blanked to the "." placeholder string
	replace `h' = "." if inlist(`h', "Cough", "Dyspnea", "FAMILY PLANNING", "Mania (Monopolar) Single Episode or Unspecified", "Sore throat", "None")

	// Relabelling for clarity
	replace `h' = "History of STI" if `h' == "Patient is contact of known or suspected infectious case"

	// GU surgery is mostly colposcopy or cervical cancer biopsy or hysterectomy
	replace `h' = "GU surgery" if `h' == "Personal history of surgery on urogenital tract"
}

// medications are going to take some work. I leave it to you to decide if it is worthwhile. We can use the substring function to extract chunks of text into new variables or replace text. You have to be careful, however, as lots of the free text includes multiple names/strings.

// Shorten the menopausal-status labels; any other value is left exactly as it is.
replace Menstrualstatus = ///
	cond(Menstrualstatus == "Perimenopausal status", "Perimenopausal", ///
	cond(Menstrualstatus == "premenopausal patient", "Premenopausal", ///
	cond(Menstrualstatus == "Postmenopausal Status (Age-Related) (Natural)", "Postmenopausal", ///
	Menstrualstatus)))

// Shorten the family-planning status labels; any other value is left exactly as it is.
replace Familyplanningstatus = ///
	cond(Familyplanningstatus == "CURRENTLY USING BIRTH CONTROL", "Current", ///
	cond(Familyplanningstatus == "Lapsed user of family planning method", "Former", ///
	cond(Familyplanningstatus == "Patient not using family planning", "Never", ///
	Familyplanningstatus)))

// Variables removed in one go:
//  - the three family-planning type columns: we don't need the type of method
//  - Race: not useful, the nurses are clearly using it for something else
//    (I think the Friendship Bridge loan cycle)
drop PuentePlanificacinFamiliar PuentePlanificacinFamiliar_o PuentePlanificacinFamiliar_e Race

//agency and healthdistrict are puente loan groups and other admin units I don't totally understand but we might be able to use them for a merge if we can get puente SES data

// Ethnicity
// Target coding (still stored as strings here; Etnia is destringed later in this file):
//   "1" = indigenous, "2" = not indigenous, "." = unknown

// Raw entries "1" and "3" are not interpretable, so they are set to the "." placeholder
// BEFORE "1"/"2" start being used as codes.
replace Etnia="." if Etnia == "1"

replace Etnia="." if Etnia == "3"

// Every spelling of Spanish / Ladino / Mestizo seen in the data (including garbled
// accents and typos) is coded as not indigenous.
replace Etnia="2" if Etnia == "Castellana" | Etnia == "ESPA" | Etnia == "ESPA√ëOL" | Etnia == "Eapa√±ol" | Etnia == "Espa√±ol" | Etnia == "LADINA" | Etnia == "LADINO" | Etnia == "Ladia" |Etnia == "Ladina" | Etnia == "Ladino" | Etnia == "Ladkna" | Etnia == "Latina" | Etnia == "MESTIZA" | Etnia == "MESTIZO" | Etnia == "MESTIO}" | Etnia == "Mestiza" | Etnia == "Mestizo" | Etnia == "espa√±ol" | Etnia == "ladina" | Etnia == "ladino" | Etnia == "mestiza" | Etnia == "mestizo"

// Everything else that was actually recorded is coded as indigenous.
// Fix: blank entries are excluded as well, so that a record with no ethnicity recorded
// stays missing instead of being counted as indigenous by this catch-all.
replace Etnia="1" if Etnia != "2" & Etnia != "." & Etnia != ""


// Preferred language clean-up. Everything is lower-cased first, then misspellings are
// collapsed onto one canonical name per language.
// ORDER MATTERS: indigenous languages are matched before Spanish because many entries
// mention two languages, and those are categorised as indigenous language > Spanish.
// Patterns are joined with "|" only where they were consecutive and share a target, so
// the matching order is unchanged.
replace IdiomaPreferido = ustrlower(IdiomaPreferido)

replace IdiomaPreferido = "tzutujil"    if ustrregexm(IdiomaPreferido, "tzu|sutu|zutu")
replace IdiomaPreferido = "mam"         if ustrregexm(IdiomaPreferido, "mam")
replace IdiomaPreferido = "kiche"       if ustrregexm(IdiomaPreferido, "quic|kich|kiqu|kuic|chich")
replace IdiomaPreferido = "ixil"        if ustrregexm(IdiomaPreferido, "ixi")
replace IdiomaPreferido = "kaqchikel"   if ustrregexm(IdiomaPreferido, "kaqc|kach|caq")
replace IdiomaPreferido = "kiche"       if ustrregexm(IdiomaPreferido, "k'ich")
replace IdiomaPreferido = "kaqchikel"   if ustrregexm(IdiomaPreferido, "kak")
// a bare "1" is not a language; use the "." placeholder
replace IdiomaPreferido = "."           if IdiomaPreferido == "1"
replace IdiomaPreferido = "spanish"     if ustrregexm(IdiomaPreferido, "spa|cast")
replace IdiomaPreferido = "kiche"       if ustrregexm(IdiomaPreferido, "kick")
// garbled n-tilde sequence, as found in mis-encoded spellings of the word for Spanish
replace IdiomaPreferido = "spanish"     if ustrregexm(IdiomaPreferido, "√±")
replace IdiomaPreferido = "sacapulteco" if ustrregexm(IdiomaPreferido, "sacapul")
replace IdiomaPreferido = "indigena"    if ustrregexm(IdiomaPreferido, "maya")
replace IdiomaPreferido = "spanish"     if ustrregexm(IdiomaPreferido, "esp|ladin")


// String --> Stata date conversion (this is the part that was updated 5 Sept 2023).
// strdate is a user-written command (see the commented-out "ssc install strdate" at the
// top of this file).
strdate FechadeEncuentro, s("md20y") f(%td)

// dob already in correct date format, commenting out (SA, 9/16) (may be a version issue)

// strdate dob, s("md20y") f(%td)

// strdate  Glucosaalazar_obsDatetime glucosaenayunas_obsDatetime HistoriaMdicaPuente_obsDatet Bloodpressuremedication_obsDat Weightkg_obsDatetime Heightcm_obsDatetime presinarterialsistlica_obs Presinarterialdiastlica_ob SerumcreatinineumolL_obsDat, s("md20y") f(%td)

// Dropping the redundant dates for the multiple medical-history points
// (drop N P R T V X Z AB AD) is not relevant for this export.


// Height: anything recorded below 100 cm is set to missing.
// (hist Heightcm can be used to inspect the distribution.)
// Again extremely conservative: probably not really many women less than 4 feet,
// but definitely possible.
replace Heightcm = . if Heightcm < 100


// There are only 19 A1C measurements and 34 creatinine (the latter all in normal range),
// so this whole run of variables is deleted; they aren't going to be useful.
drop glycosylatedhemoglobinAmeasur-SerumcreatinineumolL_encoun


// Clean up blood-pressure medication free text.
// The text is lower-cased, then each replace below collapses one spelling / phrase onto a
// canonical label ("none", "unknown", "natural", a drug name, or the "." placeholder).
// ORDER MATTERS: every replace overwrites the WHOLE string, and later patterns are tested
// against whatever earlier replaces produced.
// NOTE(review): "telmisartan, amlodipina" set below is later overwritten with
// "amlodipina" by the " amlodip" / "amlodipi" patterns -- confirm that is intended.

table Bloodpressuremedication

replace Bloodpressuremedication = ustrlower(Bloodpressuremedication)


replace Bloodpressuremedication="none" if ustrregexm(Bloodpressuremedication, "no toma")
replace Bloodpressuremedication="none" if ustrregexm(Bloodpressuremedication, "ningun")
replace Bloodpressuremedication="none" if ustrregexm(Bloodpressuremedication, "no esta")
replace Bloodpressuremedication="telmisartan, amlodipina" if ustrregexm(Bloodpressuremedication, "telmisartan/amlodipina")
replace Bloodpressuremedication="unknown" if ustrregexm(Bloodpressuremedication, "medico privado")
replace Bloodpressuremedication="unknown" if ustrregexm(Bloodpressuremedication, "san juan")
replace Bloodpressuremedication="irbesartan" if ustrregexm(Bloodpressuremedication, "iberz")
replace Bloodpressuremedication="unknown" if ustrregexm(Bloodpressuremedication, "paciente toma medicamen")
replace Bloodpressuremedication="natural" if ustrregexm(Bloodpressuremedication, "alternativa")
replace Bloodpressuremedication="natural" if ustrregexm(Bloodpressuremedication, "natural")
replace Bloodpressuremedication="unknown" if ustrregexm(Bloodpressuremedication, "desconoc")
//I read through to make sure combos weren't duplicated with other meds they aren't so the following command doesn't delete additional meds
replace Bloodpressuremedication="enalapril" if ustrregexm(Bloodpressuremedication, "enalap")
replace Bloodpressuremedication="none" if ustrregexm(Bloodpressuremedication, "anterior")
replace Bloodpressuremedication="unknown" if ustrregexm(Bloodpressuremedication, "toma medicament")
replace Bloodpressuremedication="amlodipina" if ustrregexm(Bloodpressuremedication, " amlodip")
replace Bloodpressuremedication="losartan" if ustrregexm(Bloodpressuremedication, "losart")
replace Bloodpressuremedication="unknown" if ustrregexm(Bloodpressuremedication, "privado")
replace Bloodpressuremedication="unknown" if ustrregexm(Bloodpressuremedication, "centro")
replace Bloodpressuremedication="unknown" if ustrregexm(Bloodpressuremedication, "esta con tratamiento")
replace Bloodpressuremedication="amlodipina" if ustrregexm(Bloodpressuremedication, "amlodipi")
replace Bloodpressuremedication="enalapril" if ustrregexm(Bloodpressuremedication, "anal")
replace Bloodpressuremedication="." if Bloodpressuremedication == "0"
replace Bloodpressuremedication="vymada" if ustrregexm(Bloodpressuremedication, "vymada")
replace Bloodpressuremedication="ramipril" if ustrregexm(Bloodpressuremedication, "trita")
replace Bloodpressuremedication="none" if ustrregexm(Bloodpressuremedication, "tomaba")
replace Bloodpressuremedication="losartan" if ustrregexm(Bloodpressuremedication, "tansi")
replace Bloodpressuremedication="." if ustrregexm(Bloodpressuremedication, "plan educac")

replace Bloodpressuremedication="none" if ustrregexm(Bloodpressuremedication, "no tiene")
replace Bloodpressuremedication="ramipril" if ustrregexm(Bloodpressuremedication, "ramip")
replace Bloodpressuremedication="." if Bloodpressuremedication == "0" | Bloodpressuremedication == "60" | Bloodpressuremedication == "70" | Bloodpressuremedication == "90"
replace Bloodpressuremedication="." if Bloodpressuremedication == "antecedente de mama"
replace Bloodpressuremedication="unknown" if ustrregexm(Bloodpressuremedication, "no recuerd")
replace Bloodpressuremedication="none" if ustrregexm(Bloodpressuremedication, "no eata tomando")
replace Bloodpressuremedication="unknown" if ustrregexm(Bloodpressuremedication, "no sabe")
replace Bloodpressuremedication="." if ustrregexm(Bloodpressuremedication, "hablo")
replace Bloodpressuremedication="unknown" if ustrregexm(Bloodpressuremedication, "no se recuerd")
replace Bloodpressuremedication="enalapril" if ustrregexm(Bloodpressuremedication, "lostrial")
replace Bloodpressuremedication="enalapril" if ustrregexm(Bloodpressuremedication, "lotrial")
replace Bloodpressuremedication="none" if ustrregexm(Bloodpressuremedication, "no,")
// Fix: "+" is a regex quantifier, so the unescaped pattern could never match the literal
// text "irbesartan+hidroclorotiazida"; it is escaped here to match the plus sign itself.
replace Bloodpressuremedication="irbesartan+hidroclorotiazida" if ustrregexm(Bloodpressuremedication, "irbesartan\+hidroclorotiazida")
replace Bloodpressuremedication="irbesartan" if ustrregexm(Bloodpressuremedication, "ivers")
replace Bloodpressuremedication="enalapril" if ustrregexm(Bloodpressuremedication, "enalp")
replace Bloodpressuremedication="enalapril" if ustrregexm(Bloodpressuremedication, "enlap")
replace Bloodpressuremedication="losartan" if ustrregexm(Bloodpressuremedication, "losert")
replace Bloodpressuremedication="olmesartan" if ustrregexm(Bloodpressuremedication, "olmes")
replace Bloodpressuremedication="enalapril" if ustrregexm(Bloodpressuremedication, "elap")
replace Bloodpressuremedication="losartan" if ustrregexm(Bloodpressuremedication, "satore")
replace Bloodpressuremedication="olmesartan" if ustrregexm(Bloodpressuremedication, "olmers")
replace Bloodpressuremedication="enalapril" if ustrregexm(Bloodpressuremedication, "inal")
replace Bloodpressuremedication="." if Bloodpressuremedication == "p/a: 160/100 inicio"
replace Bloodpressuremedication="none" if ustrregexm(Bloodpressuremedication, "no acepta")
replace Bloodpressuremedication="unknown" if ustrregexm(Bloodpressuremedication, "medico de ca")
replace Bloodpressuremedication="irbesartanHCTZ" if ustrregexm(Bloodpressuremedication, "bart")
replace Bloodpressuremedication="valsartan" if ustrregexm(Bloodpressuremedication, "balsart")
replace Bloodpressuremedication="atenolol" if ustrregexm(Bloodpressuremedication, "atenol")
replace Bloodpressuremedication="candesartan" if ustrregexm(Bloodpressuremedication, "candesar")
replace Bloodpressuremedication="captopril" if ustrregexm(Bloodpressuremedication, "captop")
replace Bloodpressuremedication="captopril" if ustrregexm(Bloodpressuremedication, "capti")
replace Bloodpressuremedication="telmisartanamlodipine" if ustrregexm(Bloodpressuremedication, "micard")
replace Bloodpressuremedication="." if ustrregexm(Bloodpressuremedication, "gliben")
replace Bloodpressuremedication="." if ustrregexm(Bloodpressuremedication, "guaya")
replace Bloodpressuremedication="unknown" if ustrregexm(Bloodpressuremedication, "no le gusta")
replace Bloodpressuremedication="unknown" if ustrregexm(Bloodpressuremedication, "tiene tratamiento")
replace Bloodpressuremedication="valsartanHCTZ" if ustrregexm(Bloodpressuremedication, "valsartan/hidro")
replace Bloodpressuremedication="benzepril" if ustrregexm(Bloodpressuremedication, "benazep")
replace Bloodpressuremedication="olmesartanHCTZ" if ustrregexm(Bloodpressuremedication, "benica")
replace Bloodpressuremedication="bisoprololHCTZvalsartan" if ustrregexm(Bloodpressuremedication, "bisoblo")
replace Bloodpressuremedication="unknown" if ustrregexm(Bloodpressuremedication, "se le olvida")
replace Bloodpressuremedication="amlodipinevalsartanHCTZ" if ustrregexm(Bloodpressuremedication, "exfor")
replace Bloodpressuremedication="unknown" if ustrregexm(Bloodpressuremedication, "paciente refiere")
replace Bloodpressuremedication="unknown" if ustrregexm(Bloodpressuremedication, "paciente lleva")
replace Bloodpressuremedication="unknown" if ustrregexm(Bloodpressuremedication, "paciente es")

replace Bloodpressuremedication="none" if ustrregexm(Bloodpressuremedication, "niung")
replace Bloodpressuremedication="unknown" if ustrregexm(Bloodpressuremedication, "comenta que hoy se")
replace Bloodpressuremedication="enalapril" if ustrregexm(Bloodpressuremedication, "enapr")
replace Bloodpressuremedication="none" if ustrregexm(Bloodpressuremedication, "dejo")
replace Bloodpressuremedication="." if ustrregexm(Bloodpressuremedication, "metformin")
replace Bloodpressuremedication="indapamide" if ustrregexm(Bloodpressuremedication, "indap")
replace Bloodpressuremedication="irbesartanHCTZ" if ustrregexm(Bloodpressuremedication, "hidroclorotiazida")
replace Bloodpressuremedication="olmesartanamlodipine" if ustrregexm(Bloodpressuremedication, "balz")
replace Bloodpressuremedication="unknown" if ustrregexm(Bloodpressuremedication, "si lo han")

// Final catch-all for answers containing the word "no".
// Fix: the pattern is anchored with word boundaries (\b). The bare substring "no" also
// occurs inside the labels assigned above -- "unknown" and "atenolol" -- so the
// unanchored version turned every one of those into "none".
replace Bloodpressuremedication="none" if ustrregexm(Bloodpressuremedication, "\bno\b")


																		// Clean up medical history and diabetes medications.do //												

																		
																		
																		
// Generate single 0/1 variables for a medical history significant for diabetes,
// hypertension, liver disease and kidney disease.
// A flag is 1 when ANY of the ten medical-history answer columns holds the exact label,
// because a given condition can appear in any of those columns.
// The columns are tested one at a time in a loop instead of one long "||" chain: "||" is
// not among the logical operators documented for Stata expressions (&, |, !).

gen historia_diabetes = 0
label variable historia_diabetes "History of diabetes"

gen historia_hipertension = 0
label variable historia_hipertension "History of hypertension"

gen historia_enfermedad_higado = 0
label variable historia_enfermedad_higado "History of liver disease"

gen historia_enfermedad_renal = 0
label variable historia_enfermedad_renal "History of kidney disease"

foreach h of varlist HistoriaMdicaPuente HistoriaMdicaPuente_1 HistoriaMdicaPuente_2 HistoriaMdicaPuente_3 HistoriaMdicaPuente_4 HistoriaMdicaPuente_5 HistoriaMdicaPuente_6 HistoriaMdicaPuente_7 HistoriaMdicaPuente_8 HistoriaMdicaPuente_9 {
	replace historia_diabetes          = 1 if `h' == "Diabetes Mellitus"
	replace historia_hipertension      = 1 if `h' == "Hypertension"
	replace historia_enfermedad_higado = 1 if `h' == "HEPATIC DISEASE"
	replace historia_enfermedad_renal  = 1 if `h' == "Kidney Disorder"
}

// Place the four flags right after the fasting-glucose encounter-type column
order historia_diabetes historia_hipertension historia_enfermedad_higado historia_enfermedad_renal, after(glucosaenayunas_encounterType)

// The raw answer columns and their date stamp are no longer needed
drop HistoriaMdicaPuente HistoriaMdicaPuente_obsDatet HistoriaMdicaPuente_1 HistoriaMdicaPuente_2 HistoriaMdicaPuente_3 HistoriaMdicaPuente_4 HistoriaMdicaPuente_5 HistoriaMdicaPuente_6 HistoriaMdicaPuente_7 HistoriaMdicaPuente_8 HistoriaMdicaPuente_9


// Generate separate 0/1 variables for different kinds of diabetes medication
// (metformin, sulfonylureas, insulin, medicinal plants, other, unknown and none).
// Each flag is set from the lower-cased free text in Medicamentoydosisparadiabete; the
// alternatives inside each pattern are the spellings, misspellings and brand names found
// in the data. One answer can set several flags (e.g. "diglumet", "metglital" and
// "zukermin" set both metformina and sulfon).

replace Medicamentoydosisparadiabete = ustrlower(Medicamentoydosisparadiabete)

// Fix: the flag is created as "metformina" but was updated as "metformin", which only
// works through Stata's variable-name abbreviation (and fails under set varabbrev off).
// The full name is used here.
gen metformina = 0
label variable metformina "Use of metformin"
replace metformina = 1 if ustrregexm(Medicamentoydosisparadiabete, "metform|diglumet|metglital|metfomina|metfotmina|inusita|inosita|zukermin")

gen sulfon = 0
label variable sulfon "Use of sulfonylurea"
replace sulfon = 1 if ustrregexm(Medicamentoydosisparadiabete, "glibenc|glicazet|glimep|glimip|glibenlamida|glibeclamida|gimenclamida|diglumet|metglital|zukermin")

gen insulina = 0
label variable insulina "Use of insulin"
replace insulina = 1 if ustrregexm(Medicamentoydosisparadiabete, "insulin|insilin")

gen plantas_medicinales = 0
label variable plantas_medicinales "Use of medicinal plants for diabetes"
replace plantas_medicinales = 1 if ustrregexm(Medicamentoydosisparadiabete, "natura|plantas medicinales")

gen otro_medicamento_diabetes = 0
label variable otro_medicamento_diabetes "Use of other diabetes medication"
replace otro_medicamento_diabetes = 1 if ustrregexm(Medicamentoydosisparadiabete, "inusita|inosita|januvia|jenovia|empagliflozin")

// Patient is on treatment but the medication is not named
gen desconocido = 0
label variable desconocido "Use of unknown diabetes medication"
replace desconocido = 1 if ustrregexm(Medicamentoydosisparadiabete, "tx con medico privado|tx en el centro|toma medicamentos para controlar|si con tratamiento|paciente toma tratamiento|paciente toma medicamento")
replace desconocido = 1 if ustrregexm(Medicamentoydosisparadiabete, "paciente sigue con su tratamiento|paciente esta con tratamiento|paciente lleva tratamiento|actualmente esta tomando tratamiento|desconoce nombre pero tiene tratamiento")

// No medication: phrases matched anywhere in the text, plus four answers that must match
// the whole field exactly
gen ningun_medicamento_diabetes = 0
label variable ningun_medicamento_diabetes "No diabetes medication"
replace ningun_medicamento_diabetes = 1 if ustrregexm(Medicamentoydosisparadiabete, "no tiene tratamiento|no toma medicamento|no tomar medicamento|no toma ning|no esta con medicamento|no esta tomando medicamento|no esta con ningun tratamiento")
replace ningun_medicamento_diabetes = 1 if inlist(Medicamentoydosisparadiabete, "ninguno solo dieta", "no toma", "ninguno", "no")

// With no after()/before() option, order moves these variables to the front of the dataset
order metformina sulfon insulina plantas_medicinales otro_medicamento_diabetes desconocido ningun_medicamento_diabetes

//, after(AG)
//drop Medicamentoydosisparadiabete AF AG

// Etnia holds the string codes "1", "2" and the "." placeholder at this point; convert it
// to numeric.
destring Etnia, replace



																// variable name and label cleanup (01142023).do //




// Rename variables. Target naming scheme for the cleaned dataset:
//   id_anterior edad localidad fechadeencuentro
//   glucosaalazar glucosaalazar_fecha glucosaalazar_clase
//   glucosaenayunas glucosaenayunas_fecha glucosaenayunas_clase
//   historia_diabetes historia_hipertension historia_enfermedad_higado historia_enfermedad_renal
//   metformina sulfon insulina remedio_natural_dm medicamento_dm_otro
//   medicamento_dm_desconocido medicamento_dm_ningun
//   pesokg pesokg_fecha pesokg_clase alturacm alturacm_fecha alturacm_clase
//   presionsistolica presionsistolica_fecha presionsistolica_clase
//   presiondiastolica presiondiastolica_fecha presiondiastolica_clase
//   gravida sehaabortado numerohijosvivos estadomenstrual estadoplanificacion agencia
//   etnia idiomapreferido distritosalud
//   enalapril losartan amlodipina remedio_natural_hta medicamento_hta_desconocido
//   medicamento_hta_otro medicamento_hta_ningun

// Identification / visit variables
rename (OldIdentificationNumber Edad Localidad FechadeEncuentro) ///
       (id_anterior             edad localidad fechadeencuentro)

// historia_diabetes, historia_hipertension, historia_enfermedad_higado,
// historia_enfermedad_renal, metformina, sulfon and insulina keep their names.

// Diabetes medication flags
rename (plantas_medicinales otro_medicamento_diabetes desconocido                ningun_medicamento_diabetes) ///
       (remedio_natural_dm  medicamento_dm_otro       medicamento_dm_desconocido medicamento_dm_ningun)

// Hypertension-medication indicators derived from the free-text field Bloodpressuremedication.
// Each indicator is 1 when its pattern occurs anywhere in the text and 0 otherwise.

gen enalapril = ustrregexm(Bloodpressuremedication, "enalapril")
label variable enalapril "Uso de enalapril para hipertension"

gen losartan = ustrregexm(Bloodpressuremedication, "losartan")
label variable losartan "Uso de losartan para hipertension"

// "amlodip" (not the full word) is the pattern searched for
gen amlodipina = ustrregexm(Bloodpressuremedication, "amlodip")
label variable amlodipina "Uso de amlodipina para hipertension"

gen remedio_natural_hta = ustrregexm(Bloodpressuremedication, "natural")
label variable remedio_natural_hta "Uso de remedio natural para hipertension"

gen medicamento_hta_desconocido = ustrregexm(Bloodpressuremedication, "unknown")
label variable medicamento_hta_desconocido "Uso de medicamento desconocido para hipertension"

// "Other" = any of the drug names listed below appears in the text
gen medicamento_hta_otro = 0
label variable medicamento_hta_otro "Uso de otro desconocido para hipertension"
foreach farmaco in aspirina atenolol benzepril bisoprolol candesartan captopril indapamide irbesartan natrasol olmesartan ramipril telmisartan valsartan vymada {
	replace medicamento_hta_otro = 1 if ustrregexm(Bloodpressuremedication, "`farmaco'")
}

// "None" requires the whole entry to be exactly "no" or "none"
gen medicamento_hta_ningun = inlist(Bloodpressuremedication, "no", "none")
label variable medicamento_hta_ningun "No usa ningun medicamento para hipertension"

// The free-text field and its date / encounter-type companions are no longer needed
drop Bloodpressuremedication Bloodpressuremedication_obsDat Bloodpressuremedication_encoun

// Place the hypertension indicators right after the diabetes ones
order enalapril losartan amlodipina remedio_natural_hta medicamento_hta_otro medicamento_hta_desconocido medicamento_hta_ningun, after(medicamento_dm_ningun)

// Height: value, observation date, encounter type
rename (Heightcm Heightcm_obsDatetime Heightcm_encounterType) (alturacm alturacm_fecha alturacm_clase)

// Reproductive history
rename (Gravida Sehaabortadoalgunavez TOTALNUMBEROFLIVINGCHILDREN Menstrualstatus Familyplanningstatus) (gravida sehaabortado numerohijosvivos estadomenstrual estadoplanificacion)

// Administrative / demographic columns
rename (Agencia Etnia IdiomaPreferido HealthDistrict) (agencia etnia idiomapreferido distritosalud)

// Helper column that is not part of the cleaned dataset
drop notnumeric

// Relabel variables
// Final Spanish labels for the cleaned variables. Labels assigned here replace any label
// given when the variable was generated (e.g. the one on medicamento_hta_otro).

label variable id_anterior "Numero de identificacion anterior"
label variable edad "Edad"
label variable localidad "Localidad"
label variable fechadeencuentro "Fecha de encuentro"

// Medical history
label variable historia_diabetes "Historia medica del paciente incluye diabetes"
label variable historia_hipertension "Historia medica del paciente incluye hipertension"
label variable historia_enfermedad_higado "Historia medica del paciente incluye enfermedad higado"
label variable historia_enfermedad_renal "Historia medica del paciente incluye enfermedad renal"

// Diabetes medication indicators
label variable metformina "Uso de metformina para diabetes"
label variable sulfon "Uso de sulfonilurea para diabetes"
label variable insulina "Uso de insulina para diabetes"
// was "Usa de ..." (typo); now reads like the other "Uso de ..." labels
label variable remedio_natural_dm "Uso de remedio natural para diabetes"
label variable medicamento_dm_otro "Uso de otro medicamento para diabetes"
label variable medicamento_dm_desconocido "Uso de medicamento desconocido para diabetes"
label variable medicamento_dm_ningun "Ningun uso de medicamento para diabetes"

// Hypertension medication indicators
label variable enalapril "Uso de enalapril para hipertension"
label variable losartan "Uso de losartan para hipertension"
label variable amlodipina "Uso de amlodipina para hipertension"
label variable remedio_natural_hta "Uso de remedio natural para hipertension"
label variable medicamento_hta_otro "Uso de otro medicamento para hipertension"
label variable medicamento_hta_desconocido "Uso de medicamento desconocido para hipertension"
label variable medicamento_hta_ningun "Ningun uso de medicamento para hipertension"

// Height
label variable alturacm "Altura (cm)"
label variable alturacm_fecha "Altura (cm) (fecha)"
label variable alturacm_clase "Altura (cm) (clase de encuentro)"

label variable gravida "Gravida"

// Drop because there are no observations
drop sehaabortado

label variable numerohijosvivos "Numero de hijos vivos"
label variable estadomenstrual "Estado menstrual"
label variable estadoplanificacion "Estado de planificacion familiar"
label variable agencia "Agencia"
label variable etnia "Etnia"
label variable idiomapreferido "Idioma preferido"
label variable distritosalud "Distrito salud"

// Encode variables

// Shorten the two "adulto..." encounter types before encoding:
// "adultonuevo" -> "nuevo", "adultoreconsulta" -> "reconsulta" (drop the 6-character prefix)
replace alturacm_clase = substr(alturacm_clase, 7, .) if inlist(alturacm_clase, "adultonuevo", "adultoreconsulta")

// Replace each string variable by a labelled numeric variable with the same name.
// encode cannot overwrite its source, hence the temporary "<name>2" variable.
foreach v in estadomenstrual estadoplanificacion idiomapreferido alturacm_clase {
	encode `v', generate(`v'2)
	drop `v'
	rename `v'2 `v'
}

// Value labels for the numeric ethnicity codes
label define etnia_label 1 "indigena" 2 "no indigena"
label values etnia etnia_label

// Health district: abbreviate two district names to save space
// ("COSIGUA COSIGUA DE JERONIMO" -> "CCDJ", "Las Velitas" -> "LsVlts")
replace distritosalud = cond(distritosalud == "COSIGUA COSIGUA DE JERONIMO", "CCDJ", "LsVlts") if inlist(distritosalud, "COSIGUA COSIGUA DE JERONIMO", "Las Velitas")

// Rebuild the variable from its current contents (a generated copy replaces the original,
// so it ends up as the last variable and needs its label again)
generate distritosalud2 = distritosalud
drop distritosalud
rename distritosalud2 distritosalud
label variable distritosalud "Distrito salud"

***** At this point, before proceeding, it is imperative to reshape long and deal with
**** the repeated measures variables (BP glucose weight)
**** Peter added 5 sept 2023
*** this will also include calculating AGE AT TIME OF OBSERVATION
*** the repeated measures variables first have to be renamed to <stub><number> so that
*** reshape long can pair every value with its own date and encounter type

** clean up the numbering of variables (this gets variables in sequential number from 1 to N to make reshaping easier)
** The *_encounterType and *_obsDatetime columns are renamed before the bare stub, so the
** last wildcard of each group only matches the variables that still carry the old prefix.
rename Glucosaalazar_encounterType* Randomglucose_encounterType#, renumber(1)
rename Glucosaalazar_obsDatetime* Randomglucose_obsDatetime#, renumber(1)
rename Glucosaalazar* Randomglucose#, renumber(1)

** Some repeats do not share a common prefix (names such as AQ, AT, ...), so they are listed
** explicitly and numbered by their position in the list.
** The counter is reset before EVERY loop: a local macro keeps its value for the rest of the
** do-file, so without the reset the second list would be numbered 11-20, the third 21-30, ...
** and reshape could no longer line dates / encounter types up with their values.
local i = 0
foreach x of varlist  glucosaenayunas_encounterType glucosaenayunas_encounterType_ AQ AT AW AZ BC BF BI BL {
	local ++i
	rename `x' Fastingglucose_encounterType`i'
}

rename glucosaenayunas_obsDatetime* Fastingglucose_obsDatetime#, renumber(1)

rename glucosaenayunas* Fastingglucose#,renumber(1)

rename Weightkg_obsDatetime* weight_obsDatetime#, renumber(1)
rename Weightkg_encounterType* weight_encounterType#, renumber(1)
rename Weightkg* weight#, renumber(1)

* systolic BP: observation dates
local i = 0
foreach x of varlist  presinarterialsistlica_obs DX EA ED EG EJ EM EP ES EV {
	local ++i
	rename `x' SBP_obsDatetime`i'
}

* systolic BP: encounter types
local i = 0
foreach x of varlist  presinarterialsistlica_enc DY EB EE EH EK EN EQ ET EW {
	local ++i
	rename `x' SBP_encounterType`i'
}

* diastolic BP: observation dates
local i = 0
foreach x of varlist  Presinarterialdiastlica_ob FB FE FH FK FN FQ FT FW FZ {
	local ++i
	rename `x' DBP_obsDatetime`i'
}

* diastolic BP: encounter types
local i = 0
foreach x of varlist  Presinarterialdiastlica_en FC FF FI FL FO FR FU FX GA {
	local ++i
	rename `x' DBP_encounterType`i'
}

* whatever still carries the original prefixes are the BP values themselves
rename presinarterialsistlica* SBP#, renumber(1)
rename Presinarterialdiastlica* DBP#, renumber(1)


*** Resolve the remaining incompatibilities between repeats (mostly type mismatches), then reshape long

// The observation dates are already in date format, so the five strdate calls that used to
// convert <stub>_obsDatetime2 ... <stub>_obsDatetimeN with s("md20y") f(%td) for Randomglucose,
// Fastingglucose, weight, SBP and DBP stay disabled (SA, 9/16) (may be a Stata version issue)

// Repeats 9 and 10 of the random-glucose encounter type are converted to string here
// NOTE(review): presumably so they match the storage type of the other repeats -- confirm
tostring Randomglucose_encounterType10 Randomglucose_encounterType9, replace

*** reshape long
// One stub triple (value, observation date, encounter type) per repeated measure
local stubs
foreach m in Randomglucose Fastingglucose weight SBP DBP {
	local stubs `stubs' `m' `m'_obsDatetime `m'_encounterType
}

// After this there is one row per patient (id_anterior) and repeat number (measurement)
reshape long `stubs', i(id_anterior) j(measurement)

*** This section finds, for each repeated measure, the first observation of every patient:
*** the earliest observation date and the value recorded on that date. Both are stored as
*** patient-level constants (the same on every row of the patient). There are more
*** sophisticated ways to do this if later observations turn out to matter, but for our
*** purposes we only look at the first observation of each measure.

*** I think it is best (going forward) to classify data based on DATES and ignore the encounter type (nuevo/reconsulta) as there is a first principles theoretical problem - imagine a nurse only has time to do a glucose: they might use the reconsult form (it is short) and then go back later and fill in the other. Or in reverse there may be no time for labs or the machine is broken, so the first data gets entered into a reconsulta. So just ignore the visit type classifications for now.

*** How it works: within a patient, rows are sorted by the measure's observation date.
*** Missing dates sort last, so row [1] is the earliest dated row; "measurement" breaks ties
*** between equal dates so that the chosen row is reproducible from run to run.
*** Date and value are both read from that same row [1].
*** (Previously <measure>firstvalue was a copy of each row's OWN value with gaps carried
*** forward; once the data are collapsed to the measurement==1 row this gave the value of
*** repeat 1, or a carried-forward one, which is not necessarily the value observed on
*** <measure>firstdate.)

foreach m in Randomglucose Fastingglucose weight SBP DBP {
	by id_anterior (`m'_obsDatetime measurement), sort: gen `m'firstdate = `m'_obsDatetime[1]
	format `m'firstdate %td

	// the data are still sorted as required by the line above, so no re-sort is needed
	by id_anterior (`m'_obsDatetime measurement): gen `m'firstvalue = `m'[1]
}


***** end this part ******
*****************
**************
************
*** Age at each first-observation date, computed from the date of birth. The other age variable
*** in the data set is the age at the time the data were extracted from OpenMRSID.

foreach m in Randomglucose Fastingglucose weight SBP {
	gen age`m'firstdate = age(dob, `m'firstdate)
}

// Same calculation for diastolic BP; note this variable's name ends in "firstvalue", not "firstdate"
gen ageDBPfirstvalue = age(dob, DBPfirstdate)


**** end section


*** Outlier rules from the earlier cleaning, now applied to the first-measurement values.
*** Values outside the accepted range are set to missing.

// Blood pressure: only a lower bound is applied (SBP below 70, DBP below 40).
// These are extremely conservative cuts, so David can decide whether to exclude more, e.g. DBP > 150.
// Max DBP in the data set is 150 and max SBP is 220; nothing was excluded at the upper end.
// hist SBPfirstvalue
replace SBPfirstvalue = . if 70 > SBPfirstvalue
// hist DBPfirstvalue
replace DBPfirstvalue = . if 40 > DBPfirstvalue

// Weight: keep 35-150 (kg)
// hist Weightkg
replace weightfirstvalue = . if !inrange(weightfirstvalue, 35, 150)

// Glucose: keep 40-600 for both fasting and random values
foreach g in Fastingglucosefirstvalue Randomglucosefirstvalue {
	replace `g' = . if !inrange(`g', 40, 600)
}

**** end section


** Variable order is left to Stephen's preference, so the old "order id_anterior edad localidad
** agencia ... medicamento_hta_ningun" command that used to sit here is deprecated for now.

// From here on only the first extracted values are used, always remembering the decision
// to take forward only the first-date observation of each measure.

// "drop Randomglucose1-DBPfirstvalue10" does not work: the data are still long, so
// Randomglucose1 does not exist (SA, 9/16). The long-format columns are dropped instead.
foreach m in Randomglucose Fastingglucose weight SBP DBP {
	drop `m' `m'_obsDatetime `m'_encounterType
}

// drop original medical history variables
drop BP-CF

// Back to one row per patient: keep the row with the lowest measurement number
bysort id_anterior (measurement): keep if _n == 1

drop measurement


// Saving dataset as "dataset_step1.dta"

															// analysis work feb 11 2023.do //

// Defining diabetes (Stephen, 10/14/2023)

// Drop if no glucose values are available
drop if missing(Fastingglucosefirstvalue) & missing(Randomglucosefirstvalue)

// In the original go-round, these 3 observations were still included but were later dropped because they were thought to be corrupted
// drop if id_anterior == "  8/5/2018" | id_anterior == "  9/2/2018"
// dropped 2
// drop if id_anterior == "20-02-2018"

// Generate diabetes variable, default is no diabetes
gen diabetes = 0
label variable diabetes "Diabetes (glucosa al azar >= 200 mg/dl o glucosa en ayunas >= 126 por historia medica)"

// Diabetes if already recorded in medical history or takes diabetes meds
// (sulfon is spelled out in full; the abbreviation "sulf" only resolves while variable
// abbreviation is on and no other variable starts with "sulf")
replace diabetes = 1 if historia_diabetes==1 | metformina == 1 | sulfon==1 | insulina==1 | remedio_natural_dm==1 | medicamento_dm_otro==1 | medicamento_dm_desconocido==1

// Diabetes if only fasting available and fasting elevated
replace diabetes = 1 if !missing(Fastingglucosefirstvalue) & missing(Randomglucosefirstvalue) & Fastingglucosefirstvalue >= 126

// Diabetes if only random available and random elevated
replace diabetes = 1 if !missing(Randomglucosefirstvalue) & missing(Fastingglucosefirstvalue) & Randomglucosefirstvalue >= 200

// Diabetes if both fasting and random available and the later one of the two is elevated.
// Same date counts as "fasting is the later one". A missing date compares as larger than
// any real date, so a missing fasting date also selects the fasting value.
replace diabetes = 1 if !missing(Randomglucosefirstvalue) & !missing(Fastingglucosefirstvalue) & Fastingglucosefirstdate>=Randomglucosefirstdate & Fastingglucosefirstvalue>=126
replace diabetes = 1 if !missing(Randomglucosefirstvalue) & !missing(Fastingglucosefirstvalue) & Randomglucosefirstdate>Fastingglucosefirstdate & Randomglucosefirstvalue>=200
// Commenting out diabetes definition below given that we are now defining as above (SA, 10/14/2023)															
// peter modifying directly here 14 sept 2023														
// Create diabetes variable (random glucose >= 200 mg/dl or fasting >126 or noted in medical history)
// gen diabetes = .
// label variable diabetes "Diabetes (glucosa al azar >= 200 mg/dl o glucosa en ayunas >= 126 por historia medica)"
// replace diabetes = 0 if !missing(Fastingglucosefirstvalue)
// replace diabetes = 0 if !missing(Randomglucosefirstvalue)
// DM if fasting > 126 and fasting date is same or greater than the random date (this means it was double checked and includes the scenario where the nurse did this check on different dates but entered both values on the same date at a later documentation stage)
// replace diabetes = 1 if Fastingglucosefirstvalue >= 126 & Fastingglucosefirstdate >= Randomglucosefirstdate
// DM if random > 200
// replace diabetes = 1 if Randomglucosefirstvalue >= 200 
// DM by history
// replace diabetes = 1 if historia_diabetes == 1
// replace diabetes = 1 if metformina == 1 | sulf==1 | insulina==1 | remedio_natural_dm==1 | medicamento_dm_otro==1 | medicamento_dm_desconocido==1
// tab diabetes, missing
// label define diabetes_label 0 "no diabetes" 1 "diabetes"
// label values diabetes diabetes_label



// The lines below consider diabetes to be present if either random glucose is >= 200 or fasting glucose is >= 126
//replace diabetes = 0 if !missing(Fastingglucosefirstvalue)
//replace diabetes = 0 if !missing(Randomglucosefirstvalue)
//replace diabetes = 1 if !missing(glucosaalazar) & glucosaalazar >= 200
//replace diabetes = 1 if !missing(glucosaenayunas) & glucosaenayunas >= 126
// The line below captures the case in which random glucose is elevated followed by a subsequent normal fasting glucose, considered as no diabetes
//replace diabetes = 0 if glucosaenayunas_fecha > glucosaalazar_fecha & !missing(glucosaenayunas_fecha) & glucosaenayunas < 126 & glucosaalazar >= 200
// The lines below capture the case in which diabetes is recorded in the medical history or the patient is taking medication for diabetes, considered as diabetes
//replace diabetes = 1 if historia_diabetes == 1
//replace diabetes = 1 if metformina == 1 | sulf==1 | insulina==1 | remedio_natural_dm==1 | medicamento_dm_otro==1 | medicamento_dm_desconocido==1
//tab diabetes
//tab diabetes, missing

//label define diabetes_label 0 "no diabetes" 1 "diabetes"
//label values diabetes diabetes_label


// Create hypertension variable (systolic >= 130 OR diastolic >= 80 OR noted in medical history)
// Coding: missing when there is no evidence at all, 0 when at least one BP value is
// available, 1 when any of the criteria below is met.
gen hipertension = .
// label previously lacked the closing parenthesis and mixed in an English "or"
label variable hipertension "Hipertension (sistolica >= 130, diastolica >= 80 o por historia medica)"

// At least one blood pressure value available -> provisionally no hypertension
replace hipertension = 0 if !missing(SBPfirstvalue)
replace hipertension = 0 if !missing(DBPfirstvalue)

// Elevated first systolic or diastolic value
// (the !missing() guard matters: a missing value would otherwise satisfy ">=")
replace hipertension = 1 if !missing(SBPfirstvalue) & SBPfirstvalue >= 130
replace hipertension = 1 if !missing(DBPfirstvalue) & DBPfirstvalue >= 80

// Recorded in the medical history, or on any hypertension medication
// (medicamento_hta_ningun is deliberately not part of this list)
replace hipertension = 1 if historia_hipertension == 1
replace hipertension = 1 if enalapril==1 | losartan==1 | amlodipina==1 | remedio_natural_hta==1 | medicamento_hta_otro==1 | medicamento_hta_desconocido==1
tab hipertension
tab hipertension, missing

label define hipertension_label 0 "no hipertension" 1 "hipertension"
label values hipertension hipertension_label

// Create BMI variable: weight (kg) / height (m)^2, from the first weight value and the height in cm.
// The result is missing whenever weight or height is missing.
gen imc = weightfirstvalue/(alturacm/100)^2
label variable imc "Indice de masa corporal (IMC)"
// keep one decimal
replace imc = round(imc,.1)

// BMI category: start at 1 and add one for every cut point reached
// (18.5, 25, 30, 35, 40), which gives classes 1-6; no class when imc is missing
gen imc_clase = 1 + (imc >= 18.5) + (imc >= 25) + (imc >= 30) + (imc >= 35) + (imc >= 40) if imc != .
label variable imc_clase "Clase de IMC"

label define imc_clase_label 1 "Bajo peso (<18.5)" 2 "Normal (18.5-25)" 3 "Sobrepeso (25-30)" 4 "Obesidad Grado 1 (30-35)" 5 "Obesidad Grado 2 (35-40)" 6 "Obesidad Grado 3 (40+)"
label values imc_clase imc_clase_label

// Distribution of BMI class, without and with the missing category
tab imc_clase
tab imc_clase, missing

// BMI class against each outcome, with row percentages
tab imc_clase diabetes, row

tab imc_clase hipertension, row

******* DECIDE what age to use for defining age categories
******* There is code and there are variables for the age at each data point (glucose, BP, etc.),
******* but given the missingness they are probably not as useful as hoped. So the age for the
******* measurement has to be calculated from fecha de encuentro. About ~69 observations are lost
******* where that variable is missing, with no great alternative; age could be imputed for those later.

******* don't use edad! It is the age at the time the data were extracted, which could be off by several years or more

gen ageforstandardization = age(dob, fechadeencuentro)


*********************
******* PETER STOPPED HERE ******
*********************

// saving dataset as "dataset_step2.dta"

// Create a variable for taking diabetes meds: 1 if any diabetes-medication indicator is 1, else 0
// (medicamento_dm_ningun is not part of the list)
gen diabetes_med = (metformina==1 | sulfon==1 | insulina==1 | remedio_natural_dm==1 | medicamento_dm_otro==1 | medicamento_dm_desconocido==1)

// Preliminary flowsheet

// Total Observations (n=14,033)

// browse Randomglucosefirstvalue Fastingglucosefirstvalue if missing(Randomglucosefirstvalue) & missing(Fastingglucosefirstvalue)

// drop if missing(Randomglucosefirstvalue) & missing(Fastingglucosefirstvalue)

// At least one glucose value (n=13,646)

// browse if diabetes_med == 1 // 282 on some kind of diabetes medication

// browse if historia_diabetes == 1 // 668 with prior diagnosed diabetes

// browse if diabetes_med == 1 | historia_diabetes == 1 // 699 meet both cases

// drop if diabetes_med==1 | historia_diabetes==1
// 699 observations deleted

// At least one glucose value and prior history of diabetes or on diabetes meds (n=699)

// At least one glucose value and no prior diagnosed diabetes or dm meds (n=12,947)

// Some code not included for the sake of clarity (wherever there is an indentation, the code was left out)

// browse Randomglucosefirstvalue Fastingglucosefirstvalue if missing(Randomglucosefirstvalue) & !missing(Fastingglucosefirstvalue)
// Only fasting available (n=976)
			// Only fasting available & fasting >= 126 (n=61)
			// Only fasting available & fasting < 126 (n=915)

// browse Randomglucosefirstvalue Fastingglucosefirstvalue if !missing(Randomglucosefirstvalue) & missing(Fastingglucosefirstvalue)
// Only random available (n=10,581)
			// Only random available & random >= 200 (n=165)
			// Only random available & random < 200 (n=10,416)

// browse Randomglucosefirstvalue Fastingglucosefirstvalue if !missing(Randomglucosefirstvalue) & !missing(Fastingglucosefirstvalue)
// Both fasting and random available (n=1,390)
			// Either random or fasting elevated (n=271)
						// Both fasting and random elevated (n=143)
									// fasting before random (n=3)
									// random before fasting or same date (n=140)
						// Random elevated but not fasting (n=72)
									// fasting before random (n=14)
									// random before fasting or same date (n=58)
						// Fasting elevated but not random (n=56)
									// fasting before random (n=12)
									// random before fasting or same date (n=44)
			// Neither random nor fasting elevated (n=1,119)











// Can drop these now that BP first value extracted (SA 9/16)
// Removes the free-text diabetes-medication field plus the systolic/diastolic columns _1 to _9
// that still carry their original (mis-encoded) import names.
// NOTE(review): these names contain encoding residue and must match the dataset byte for byte;
// drop stops with an error if any one of them does not exist -- confirm against the imported names.
drop Medicamentoydosisparadiabete presiÃnarterialsistÃlica_1 presiÃnarterialsistÃlica_2 presiÃnarterialsistÃlica_3 presiÃnarterialsistÃlica_4 presiÃnarterialsistÃlica_5 presiÃnarterialsistÃlica_6 presiÃnarterialsistÃlica_7 presiÃnarterialsistÃlica_8 presiÃnarterialsistÃlica_9 PresiÃnarterialdiastÃlica_1 PresiÃnarterialdiastÃlica_2 PresiÃnarterialdiastÃlica_3 PresiÃnarterialdiastÃlica_4 PresiÃnarterialdiastÃlica_5 PresiÃnarterialdiastÃlica_6 PresiÃnarterialdiastÃlica_7 PresiÃnarterialdiastÃlica_8 PresiÃnarterialdiastÃlica_9

// Rename edad to ageatextraction (SA 9/16)
// The new name makes explicit that this is the age when the data were extracted,
// not the age at the encounter (that one is ageforstandardization).
rename edad ageatextraction

// Reorder
// order id_anterior edad localidad agencia distritosalud etnia idiomapreferido fechadeencuentro glucosaalazar glucosaalazar_fecha glucosaalazar_clase glucosaenayunas glucosaenayunas_fecha glucosaenayunas_clase diabetes pesokg pesokg_fecha pesokg_clase alturacm alturacm_fecha alturacm_clase imc imc_clase presionsistolica presionsistolica_fecha presionsistolica_clase presiondiastolica presiondiastolica_fecha presiondiastolica_clase hipertension gravida numerohijosvivos estadomenstrual estadoplanificacion historia_diabetes historia_hipertension historia_enfermedad_higado historia_enfermedad_renal metformina sulfon insulina remedio_natural_dm medicamento_dm_otro medicamento_dm_desconocido medicamento_dm_ningun enalapril losartan amlodipina remedio_natural_hta medicamento_hta_otro medicamento_hta_desconocido medicamento_hta_ningun

// Age adjustment

// Age standardized estimates to the WHO world population estimates

// First, prepare age groups: 5-year bands "0-4", "5-9", ..., "95-99", then "100+".
// Generating the bands in a loop keeps label and bounds in step (the hand-written
// "85-89" line had an upper bound of 95).
gen age_group = ""
forvalues piso = 0(5)95 {
	local techo = `piso' + 4
	replace age_group = "`piso'-`techo'" if ageforstandardization >= `piso' & ageforstandardization < `piso' + 5
}
// A missing age compares as larger than any number, so ">= 100" alone would put every
// patient without an age into "100+". They must keep an empty age_group instead, which
// leaves them out of the age-standardized estimates.
replace age_group = "100+" if ageforstandardization >= 100 & !missing(ageforstandardization)

// Then merge WHO standard population data with working database, by age_group.
// Age groups that exist only in the WHO file (_merge==2) are dropped, as in the localidad
// merge below; otherwise each of them adds a row with no patient behind it.
merge m:1 age_group using "WHO population data.dta"
drop if _merge==2
drop _merge

// Localidad department data
// Adds the department of each localidad; localidades that occur only in the lookup file are dropped.
merge m:1 localidad using "localidad departamento.dta"
drop if _merge==2
drop _merge
label variable departamento "Departamento"
order departamento, after(localidad)
replace departamento = "Unknown" if localidad ==""

sort ageforstandardization

tab departamento diabetes, row

// Outcomes
// Crude and age-standardized prevalence. The standardized versions use age_group as the
// standard strata and who_population as the standard weights.
local estandar stdize(age_group) stdweight(who_population)

// Diabetes prevalence
proportion diabetes // crude
proportion diabetes, over(imc_clase) // crude over bmi
proportion diabetes, `estandar' // age-standardized
proportion diabetes, `estandar' over(imc_clase) // age-standardized over bmi

// Hypertension prevalence (the two "over bmi" versions are disabled)
proportion hipertension // crude
// proportion hipertension, over (imc_clase) // crude over bmi
proportion hipertension, `estandar' // age-standardized
// proportion hipertension, stdize(age_group) stdweight(who_population) over(imc_clase) // age-standardized over bmi

// BMI categories prevalence
proportion imc_clase // crude
proportion imc_clase, `estandar' // age-standardized

// ROC curves for diabetes by fasting criteria among those with no prior diabetes history
// gen diabetes_sinhistoria = .
// replace diabetes_sinhistoria = 0 if !missing(glucosaalazar)
// replace diabetes_sinhistoria = 0 if !missing(glucosaenayunas)
// replace diabetes_sinhistoria = 1 if !missing(glucosaalazar) & glucosaalazar >= 200
// replace diabetes_sinhistoria = 1 if !missing(glucosaenayunas) & glucosaenayunas >= 126
// replace diabetes_sinhistoria = . if historia_diabetes == 1
// tab diabetes_sinhistoria
// tab diabetes_sinhistoria, missing
// roctab diabetes_sinhistoria imc, detail graph summary


// order id_anterior metformina sulfon insulina remedio_natural_dm medicamento_dm_otro medicamento_dm_desconocido medicamento_dm_ningun enalapril losartan amlodipina remedio_natural_hta medicamento_hta_otro medicamento_hta_desconocido medicamento_hta_ningun ageatextraction localidad departamento fechadeencuentro historia_diabetes historia_hipertension historia_enfermedad_higado historia_enfermedad_renal CH CI alturacm alturacm_fecha gravida numerohijosvivos agencia etnia dob estadomenstrual estadoplanificacion idiomapreferido alturacm_clase distritosalud Randomglucosefirstdate Randomglucosefirstvalue Fastingglucosefirstdate Fastingglucosefirstvalue weightfirstdate weightfirstvalue SBPfirstdate SBPfirstvalue DBPfirstdate DBPfirstvalue ageRandomglucosefirstdate ageFastingglucosefirstdate ageweightfirstdate ageSBPfirstdate ageDBPfirstvalue diabetes hipertension imc imc_clase ageforstandardization diabetes_med age_group WHOWorldStandard Recalculationtoaddto100000 who_population StandardForSEERStat

// Suspect these can be dropped as well at this point (SA 10/14)
// Removes the two variables CH and CI from the working dataset.
drop CH CI

// NOTE(review): the data in memory at this point is what the commented-out merge
// section below refers to as "dataset_step3.dta"; no save command is issued here.

													// merge do file.do //													
												
/*													
/// merging Puente admin data with chart data

//Puente poverty data comes in two non-overlapping forms: more recently, the poverty stoplight;
//before that, the poverty probability index (PPI)

//this one is the "new" poverty stoplight data

// id is string in master file

use "/Users/prohloff/Downloads/dataset_step3.dta", clear
sort id_anterior

//several bad MRNs in the base file

 drop if id_anterior == ""
//dropped 5
drop if id_anterior == "  8/5/2018" | id_anterior == "  9/2/2018"
//dropped 2

gen byte notnumeric = real(id_anterior)==. /*makes indicator for obs w/o numeric values*/
tab notnumeric /*==1 where nonnumeric characters*/
list id_anterior if notnumeric==1

drop if id_anterior == "20-02-2018"


destring id_anterior, replace
save "/Users/prohloff/Downloads/dataset_step3.dta", replace

clear all 
import excel "/Users/prohloff/Downloads/FV2022.xlsx", sheet("Hoja1") firstrow
drop caseid rol_in_group branch_code current_facilitator current_group_id
drop dob
drop vigencia
drop product_code
drop state
drop status
drop CAV_2022 Mora_2022 Chanim_2022 Paralelo_2022 AsistenciaAgricola_2022 Clinica_salud_2022 Individual_2022 Solidario_2022
drop Idioma_frecuente Marital_status_2022

rename client_id id_anterior

sort id_anterior
duplicates list id_anterior

//merge with DB


merge 1:1 id_anterior using "/Users/prohloff/Downloads/dataset_step3.dta"

//drop if only in the admin database but not openmrs database
drop if _merge == 1
drop _merge

save "/Users/prohloff/Downloads/dataset_step4_pr.dta"

// import second database (newer and smaller)
drop Organization Agencia Createdat Surveynumber Familycode Documentnumber Familyname Surveyuser
drop Origin-Countryofbirth
drop Email Phonenumber Postcode Streetdescription Familylocation
drop NodeCuotas-AC
 drop Localizar Consulta Examen Mortalidadinfantil Totalsurveytimeminutes


rename Códigoclienta id_anterior

sort id_anterior
duplicates list id_anterior
+------------------------+
  | Group   Obs   id_ant~r |
  |------------------------|
  |     1   252      75172 |
  |     1   253      75172 |
  |     2   672     110778 |
  |     2   673     110778 |

  
  
  drop if id_anterior == 75172 & Actividadnegocio == "Animales"
  
  drop if id_anterior == 110778 & Peopleinhousehold == "2"
  
   save "/Users/prohloff/Downloads/dataset2 stoplight only mrn.dta"

   // second merge
   
    use "/Users/prohloff/Downloads/dataset_11feb2023_stoplightmerge.dta"
	
	merge 1:1 id_anterior using "/Users/prohloff/Downloads/dataset2 stoplight only mrn.dta"
	
drop if _merge == 2

save "/Users/prohloff/Downloads/dataset_11feb2023_stoplightmergex2.dta"
*/

// Load the dataset saved after the stoplight merge (replaces whatever is in memory)
use "dataset_step4_pr.dta", clear


											// analysis work mar 35 2023.do //


// --- Encode string variables as labelled numeric variables ---

// Marital status
encode marital_status, gen(estado_civil)
drop marital_status

// Business activity: blank out placeholder entries first ...
replace business_activity = "" if inlist(business_activity, "0", "Sin Información")

// ... then collapse the free-text entries into a handful of groups.
// Agriculture: exact matches plus anything mentioning raising / growing / production
replace business_activity = "Agricultura" if inlist(business_activity, "Alimentos", "Animales", "Ganaderia")
replace business_activity = "Agricultura" if ustrregexm(business_activity, "Crianza|Cultivo|Produccion|Producción")

// Commerce: anything mentioning sales or commerce (the match is case-sensitive)
replace business_activity = "Comercio" if ustrregexm(business_activity, "Venta|Compra y venta|Comercio")

// Textiles and handicrafts (exact matches only)
replace business_activity = "Textiles" if inlist(business_activity, "Artesanias", "Elaboración de productos de artesanía", "Elaboración de ropa", "Elaboración de ropa típica")

// Food services (exact matches only; note the trailing space in the last entry)
replace business_activity = "Servicios" if inlist(business_activity, "Elaboración de comida", "Refacciones", "Refrescos y licuados", "Tortillería ")

// Every remaining non-empty entry becomes "Otro"
replace business_activity = "Otro" if !inlist(business_activity, "Agricultura", "Comercio", "Textiles", "Servicios", "")

encode business_activity, gen(negocio)
drop business_activity

// Area classification
encode Clasificacion_area_2022, gen(urban_rural)
drop Clasificacion_area_2022

// Merge 2 language variables
// First rewrite the spellings used in Idioma into the lower-case names used below
// (presumably the same names carried by the idiomapreferido value label - confirm).
// Values already spelled "mam" / "ixil" need no change.
replace Idioma = "spanish" if Idioma == "Español"
replace Idioma = "kiche" if Idioma == "K'iche'"
replace Idioma = "kaqchikel" if Idioma == "Kackchiquel"
replace Idioma = "mam" if Idioma == "Mam"
replace Idioma = "qeqchi" if Idioma == "Q'eqchi'"
replace Idioma = "sacapulteco" if Idioma == "Sakapulteko"
replace Idioma = "tzutujil" if Idioma == "Tz'utujil"
replace Idioma = "" if Idioma == "Sin información"

// Then merge: start from the decoded idiomapreferido and fill its gaps from Idioma
// browse Idioma idiomapreferido
decode idiomapreferido, generate(idiomapreferido2)
replace idiomapreferido2 = Idioma if missing(idiomapreferido2)
// "indigena" does not identify a specific language, so it is treated as unknown
replace idiomapreferido2 = "" if idiomapreferido2 == "indigena"
drop idiomapreferido

// Create a variable Mayan language (1) vs. Spanish (0); left missing when the language is unknown.
// The merged string variable idiomapreferido2 is referenced by its full name throughout: the
// numeric idiomapreferido no longer exists at this point, so the shorter name only resolved
// through Stata's variable abbreviation. "indigena" is not tested because it was blanked above.
gen idiomamaya = 1 if inlist(idiomapreferido2, "kiche", "kaqchikel", "mam", "qeqchi", "sacapulteco", "tzutujil", "ixil")
replace idiomamaya = 0 if idiomapreferido2 == "spanish"

// Re-create idiomapreferido as a labelled numeric variable from the merged strings
encode idiomapreferido2, generate(idiomapreferido)
drop idiomapreferido2
drop Idioma

// --- Diabetes prevalence ---
proportion diabetes // crude
proportion diabetes, over (imc_clase) // crude over BMI
proportion diabetes, stdize(age_group) stdweight(who_population) // age-standardized
// Age-standardized, stratified in turn by BMI class, marital status, business activity,
// urban/rural setting, Mayan language and ethnicity
foreach stratum in imc_clase estado_civil negocio urban_rural idiomamaya etnia {
	proportion diabetes, stdize(age_group) stdweight(who_population) over(`stratum')
}

// --- Hypertension prevalence ---
proportion hipertension // crude
// proportion hipertension, over (imc_clase) // crude over BMI (disabled)
proportion hipertension, stdize(age_group) stdweight(who_population) // age-standardized
// Age-standardized, stratified in turn by BMI class, marital status, business activity,
// urban/rural setting and Mayan language
foreach stratum in imc_clase estado_civil negocio urban_rural idiomamaya {
	proportion hipertension, stdize(age_group) stdweight(who_population) over(`stratum')
}


// --- BMI category prevalence ---
proportion imc_clase // crude
proportion imc_clase, stdize(age_group) stdweight(who_population) // age-standardized

// ROC curves for diabetes by fasting criteria among those with no prior diabetes history
// gen diabetes_sinhistoria = .
// replace diabetes_sinhistoria = 0 if !missing(glucosaalazar)
// replace diabetes_sinhistoria = 0 if !missing(glucosaenayunas)
// replace diabetes_sinhistoria = 1 if !missing(glucosaalazar) & glucosaalazar >= 200
// replace diabetes_sinhistoria = 1 if !missing(glucosaenayunas) & glucosaenayunas >= 126
// replace diabetes_sinhistoria = . if historia_diabetes == 1
// tab diabetes_sinhistoria
// tab diabetes_sinhistoria, missing
// roctab diabetes_sinhistoria imc, detail graph summary

// Diabetes status by department, with row percentages
tab departamento diabetes, row

// Dataset immediately prior to the PPI merge.
// replace lets the do-file be re-run when the file already exists from an earlier run.
save "dataset step5.dta", replace




																	// Merging PPI data april 2023.data //
													
													
													
													
clear

// Read the pre-2020 PPI export; the first row holds the variable names
import delimited "PPI data prior to 2020.csv", varnames(1) clear

rename client_id id_anterior

// merge 1:1 id_anterior using "C:\Stephen\Wuqu' Kawoq\Cardiometabolic Risk Factors\dataset step5.dta"
// variable id does not uniquely identify observations in the master data
// => the PPI data must be reduced to one record per id_anterior before merging

// A stable sort keeps records that share an id in their original file order. Without it Stata
// orders tied observations arbitrarily, so the "first"/"second" record picked below could
// change from run to run.
sort id_anterior, stable
// Mark duplicates and triplicates with dup variable
// (0 = id occurs once; otherwise 1, 2, 3, ... = position of the record within its id)
quietly by id_anterior:  gen dup = cond(_N==1,0,_n)

// Deal with the triplicates first
// browse id_anterior dup if dup > 2

/* Triplicates
#N/A
24954
379
60557
63905
64602
68861
69507
*/

// Drop the N/A observation
drop if id_anterior == "#N/A"

// Check to see which of the triplicates have a 2019 score
// browse id_anterior ppiv2_2019_score dup if id_anterior == "24954" | id_anterior == "379" | id_anterior == "60557" | id_anterior == "63905" | id_anterior == "64602" | id_anterior == "68861" | id_anterior == "69507"

// Each triplicate has at least one version with the 2019 score
// Drop the triplicate observations that do not have a 2019 score ("---" marks a missing score)
drop if inlist(id_anterior, "24954", "379", "60557", "63905", "64602", "68861", "69507") & ppiv2_2019_score == "---"
// Browse again
// browse id_anterior ppiv2_2019_score dup if id_anterior == "24954" | id_anterior == "379" | id_anterior == "60557" | id_anterior == "63905" | id_anterior == "64602" | id_anterior == "68861" | id_anterior == "69507"
// Drop the one remaining duplicate (id 64602): keep its first remaining record and drop any
// later one. This does not depend on which dup number the surviving records happen to carry.
bysort id_anterior (dup): drop if id_anterior == "64602" & _n > 1
// Browse again
// browse id_anterior ppiv2_2019_score dup if id_anterior == "24954" | id_anterior == "379" | id_anterior == "60557" | id_anterior == "63905" | id_anterior == "64602" | id_anterior == "68861" | id_anterior == "69507"
// Good
// Change value of dup to 0 for these observations
replace dup = 0 if inlist(id_anterior, "24954", "379", "60557", "63905", "64602", "68861", "69507")

// Now deal with duplicates
// browse id_anterior dup if dup > 0
// Huge number

// Check to see which of the duplicates have a 2019 score
// browse id_anterior ppiv2_2019_score dup if dup > 0
// Most but not all have a 2019 score

// The steps below compare each record with its neighbour ([_n-1] / [_n+1]), so they rely on
// the data still being ordered by id_anterior.

// Deal first with the duplicates that do not have a 2019 score. Take the first one. That is, drop the second
// First mark the first as a non-duplicate
replace dup = 0 if id_anterior[_n] == id_anterior[_n+1] & ppiv2_2019_score[_n] == "---" & ppiv2_2019_score[_n+1] == "---"
// Then drop the second one
drop if id_anterior[_n] == id_anterior[_n-1] & ppiv2_2019_score[_n] == "---" & ppiv2_2019_score[_n-1] == "---"

// Remaining duplicates should have a 2019 score
// browse id_anterior ppiv2_2019_score dup if dup > 0
// It appears they do
// Drop if duplicate and no 2019 score
// If the first one has a 2019 score, mark it as non-duplicate and delete the second one
// (7 is a temporary marker value)
replace dup = 7 if id_anterior[_n] == id_anterior[_n+1] & ppiv2_2019_score[_n] != "---"
drop if id_anterior[_n] == id_anterior[_n-1] & dup[_n-1] == 7
replace dup = 0 if dup == 7
// If the second one has a 2019 score, mark it as non-duplicate and delete the first one
// (9 is a temporary marker value)
replace dup = 9 if id_anterior[_n] == id_anterior[_n-1] & ppiv2_2019_score[_n] != "---"
drop if id_anterior[_n] == id_anterior[_n+1] & dup[_n+1] == 9
replace dup = 0 if dup == 9

// All duplicates should be gone now
// browse id_anterior ppiv2_2019_score dup if dup > 0
drop dup

// Re-attempt merge
// id_anterior was read as a string; convert it to numeric before using it as the merge key
destring id_anterior, replace
// PPI data is the master, the chart dataset is the using file; match status goes to _merge_ppi
merge 1:1 id_anterior using "dataset step5.dta", gen(_merge_ppi)

// Drop if not matched from PPI dataset (i.e. records present only in the PPI file)
drop if _merge_ppi==1

// replace lets the do-file be re-run when the file already exists from an earlier run
save "dataset step6.dta", replace

																		// analysis work april 16 2023.do //
															
															
															
// Remove the 2022 stoplight/PPI fields, the pre-2019 PPI fields and the merge indicators
drop PPI_probability_2022 PPI_income_2022 PPI_category_2022 ///
	PovertystoplightReds PovertystoplightYellows PovertystoplightGreens _merge ppiv21719 ///
	ppiv1_2012_level ppiv1_2012_score problnpv1_2012 lnpv1_2012 ///
	ppiv1_2013_level ppiv1_2013_score problnpv1_2013 lnpv1_2013 ///
	ppiv1_2014_level ppiv1_2014_score problnpv1_2014 lnpv1_2014 ///
	ppiv1_2015_level ppiv1_2015_score problnpv1_2015 lnpv1_2015 ///
	ppiv1_2016_level ppiv1_2016_score problnpv1_2016 lnpv1_2016 ///
	ppiv1_2017_level ppiv1_2017_score problnpv1_2017 lnpv1_2017 ///
	ppiv1_2018_level ppiv1_2018_score problnpv1_2018 lnpv1_2018 ///
	ppiv2_2017_level ppiv2_2017_score problnpv2_2017 lnpv2_2017 ///
	ppiv2_2018_level ppiv2_2018_score problnpv2_2018 lnpv2_2018 _merge_ppi

// Change variables to appropriate variable types for new ppi data

// "---" is the placeholder for a missing value in all four 2019 PPI fields
foreach ppi_var in ppiv2_2019_level lnpv2_2019 ppiv2_2019_score problnpv2_2019 {
	replace `ppi_var' = "" if `ppi_var'=="---"
}

// Level and lnpv2_2019 become labelled numeric variables that keep their original names
encode(ppiv2_2019_level), gen(ppiv2_2019_level2)
drop ppiv2_2019_level
rename ppiv2_2019_level2 ppiv2_2019_level

encode(lnpv2_2019), gen(lnpv2_20192)
drop lnpv2_2019
rename lnpv2_20192 lnpv2_2019

// Score and poverty probability become plain numeric variables
destring ppiv2_2019_score, replace
destring problnpv2_2019, replace

// Recode ppiv2_2019_level so that its codes follow the order of the poverty levels
// Current code -> new code
// Encima             1 -> 1
// Vulnerable         4 -> 2
// Vulnerable Extrema 5 -> 3
// Pobreza            2 -> 4
// Pobreza Extrema    3 -> 5

// Score | Poverty level
// <20 | Pobreza Extrema
// 20-44.9 | Pobreza
// 45-54.9 | Vulnerable Extrema
// 55-69.9 | Vulnerable
// 70+ | Encima

// Score | Probability of Poverty
// 20 | 93.4%
// 45 | 33.8%
// 55 | 15.6%
// 70 | 0.7%

// Translate the current codes into level names; anything else (including missing) stays empty
gen problnpv2_2019_level_new = cond(ppiv2_2019_level==1, "Encima", ///
	cond(ppiv2_2019_level==4, "Vulnerable", ///
	cond(ppiv2_2019_level==5, "Vulnerable Extrema", ///
	cond(ppiv2_2019_level==2, "Pobreza", ///
	cond(ppiv2_2019_level==3, "Pobreza Extrema", "")))))


// Re-encode the names against a value label that fixes the desired code for each level
label define ppi_label 1 "Encima" 2 "Vulnerable" 3 "Vulnerable Extrema" 4 "Pobreza" 5 "Pobreza Extrema"

drop ppiv2_2019_level
encode problnpv2_2019_level_new, gen(ppiv2_2019_level) label(ppi_label)
drop problnpv2_2019_level_new

// https://www.stata.com/manuals/drecode.pdf

// Add a variable for age group
// Ten-year bands built from ageforstandardization. Ages of 90 and above, and missing ages
// (which never satisfy a "<" comparison), are left unclassified.
gen edad_clase = ""
replace edad_clase = "<20" if ageforstandardization < 20
replace edad_clase = "20-29" if ageforstandardization >= 20 & ageforstandardization < 30
replace edad_clase = "30-39" if ageforstandardization >= 30 & ageforstandardization < 40
replace edad_clase = "40-49" if ageforstandardization >= 40 & ageforstandardization < 50
// This band covers ages 50 to 59, so it is labelled "50-59" (it was mislabelled "50-69")
replace edad_clase = "50-59" if ageforstandardization >= 50 & ageforstandardization < 60
replace edad_clase = "60-69" if ageforstandardization >= 60 & ageforstandardization < 70
replace edad_clase = "70-79" if ageforstandardization >= 70 & ageforstandardization < 80
replace edad_clase = "80-89" if ageforstandardization >= 80 & ageforstandardization < 90
// Encode against a predefined label so that the codes follow age order
label define edad_label 1 "<20" 2 "20-29" 3 "30-39" 4 "40-49" 5 "50-59" 6 "60-69" 7 "70-79" 8 "80-89"
encode edad_clase, gen(edad_clase2) label(edad_label)
drop edad_clase
rename edad_clase2 edad_clase

// Add a variable for number of living children (numerohijosvivos), grouped into classes
gen numerohijos_clase = ""
replace numerohijos_clase = "0-2" if numerohijosvivos <= 2
replace numerohijos_clase = "3-5" if numerohijosvivos >= 3 & numerohijosvivos <= 5
replace numerohijos_clase = "6-8" if  numerohijosvivos >= 6 & numerohijosvivos <= 8
// ">= 9" so that exactly 9 children falls in "9+". Missing values must be excluded
// explicitly because Stata treats missing as larger than any number.
replace numerohijos_clase = "9+" if  numerohijosvivos >= 9 & !missing(numerohijosvivos)
// Encode against a predefined label so that the codes follow the class order
label define numerohijos_label 1 "0-2" 2 "3-5" 3 "6-8" 4 "9+"
encode numerohijos_clase, gen(numerohijos_clase2) label(numerohijos_label)
drop numerohijos_clase
rename numerohijos_clase2 numerohijos_clase

// replace lets the do-file be re-run when the file already exists from an earlier run
save "dataset step7.dta", replace

															// demo table may 05 2023.do //
															
																					
// Median and interquartile range of the continuous characteristics
// NOTE(review): no variable named exactly "edad" is created in the visible part of this file;
// if none exists, Stata's variable abbreviation resolves it to edad_clase (the age-group
// code) - confirm that this is the intended variable.
foreach v in edad imc numerohijosvivos {
	tabstat `v', statistics(median p25 p75)
}

// Create quantiles for poverty_quantile
tabstat problnpv2_2019 , statistics(p25 p50 p75)
// Variable |       p25       p50       p75
// problnp~2019 |      .156      .466      .782

// Quartile of poverty probability, using the cut-points printed above:
// 1 = lowest probability of poverty ... 4 = highest; missing stays missing
gen poverty_quantile = cond(missing(problnpv2_2019), ., ///
	cond(problnpv2_2019 < .156, 1, ///
	cond(problnpv2_2019 < .466, 2, ///
	cond(problnpv2_2019 < .782, 3, 4))))

// Consolidated BMI class: codes 5 and 6 are folded into code 4 ("Obesidad")
gen imc_clase2 = imc_clase
label variable imc_clase2 "IMC Clase Consolidada"
replace imc_clase2 = 4 if inlist(imc_clase2, 5, 6)
label define bmi_label 1 "Bajo peso" 2 "Normal" 3 "Sobrepeso" 4 "Obesidad"
label values imc_clase2 bmi_label

tab imc_clase2

// Frequency tables including the missing category
foreach v in imc_clase2 etnia idiomamaya urban_rural negocio poverty_quantile estado_civil {
	tab `v', missing
}

foreach v in edad imc numerohijosvivos {
	tab `v', missing
}

// Calculate missing data
// replace imc_clase2 = 5 if missing(imc_clase2)
// replace etnia = 3 if missing(etnia)
// replace idiomamaya = 2 if missing(idiomamaya)
// replace urban_rural = 3 if missing(urban_rural)
// replace poverty_quantile = 5 if missing(poverty_quantile)

// Data for demographics table
// capture ssc install logout
// cd "${root}"
// logout, save("output.csv") excel replace: tab edad_clase
// logout, save("output1.csv") excel replace: tab imc_clase
// logout, save("output2.csv") excel replace: tab etnia
// logout, save("output3.csv") excel replace: tab idiomamaya
// logout, save("output4.csv") excel replace: tab urban_rural
// logout, save("output5.csv") excel replace: tab departamento
// logout, save("output6.csv") excel replace: tab negocio
// logout, save("output7.csv") excel replace: tab ppiv2_2019_level
// logout, save("output8.csv") excel replace: tab estado_civil
// logout, save("output9.csv") excel replace: tab numerohijos_clase

															// fix departamento and langauge data 10/21/2023 //
															
// these got messed up because of formatting issues

// Localidad department data
// Rebuild departamento from localidad using the corrected lookup file

drop departamento

// keep(master match) discards lookup rows with no matching record in the data;
// nogenerate avoids creating a _merge variable at all
merge m:1 localidad using "localidad departamento2.dta", keep(master match) nogenerate
label variable departamento "Departamento"
order departamento, after(localidad)
replace departamento = "Unknown" if localidad ==""

sort ageforstandardization

tab departamento diabetes, row


tab idiomapreferido

// Codes 2, 3 and 4 of idiomapreferido are misspelled variants of "español" (see the tab
// above); count them as Spanish (5 changes made).
// NOTE(review): these numeric codes depend on how encode ordered the labels - re-check
// them against the tab output whenever the input data changes.
replace idiomamaya = 0 if idiomapreferido==2 | idiomapreferido==3 | idiomapreferido==4

// Dataset used for the multiple imputation.
// replace lets the do-file be re-run when the file already exists from an earlier run.
save "dataset step8.dta", replace

															// multiple imputation 06 30 2023.do //
	


// Only records with a known diabetes status are analysed
drop if diabetes == .

// --- Prepare dataset for multiple imputation ---

// Urban as a 0/1 indicator (required for mi): urban_rural code 2 -> 1, code 1 -> 0, otherwise missing
gen urban = cond(urban_rural == 2, 1, cond(urban_rural == 1, 0, .))
tab urban

// Ethnicity as a 0/1 indicator: etnia code 1 -> 1, code 2 -> 0, otherwise missing
gen indigenous = cond(etnia == 1, 1, cond(etnia == 2, 0, .))

// Clean estado_civil: code 1 carries the label "0", which is meaningless
tab estado_civil, m
replace estado_civil = . if estado_civil == 1

// Encode departamento as a labelled numeric variable
encode departamento, gen(departamento_encoded)
tab departamento_encoded, m
// NOTE(review): code 10 is set to missing; presumably it is the "Unknown" category -
// confirm against the tab output above.
replace departamento_encoded = . if departamento_encoded == 10



// --- Perform multiple imputation ---
// Imputes missing values of indigenous, idiomamaya, urban, poverty_quantile and imc_clase2
// by chained equations, using the fully observed variables listed after "=" as predictors.

// Set things up

// Fixed seed so that the imputations are reproducible
set seed 12345
// Wide style: each imputation is stored as extra variables in the same dataset
mi set wide
// Clear any panel (xt) settings
mi xtset, clear

// Specify imputed and regular variables

mi register imputed indigenous idiomamaya urban poverty_quantile imc_clase2
mi register regular negocio diabetes departamento_encoded gravida alturacm SBPfirstvalue weightfirstvalue ageforstandardization
		
//	Imputation model
// One equation per imputed variable: logit for urban, mlogit for indigenous and idiomamaya,
// ologit for poverty_quantile and imc_clase2.
// NOTE: everything after the first "///" on the next line is a comment, so the (pmm, knn(5))
// specification written there is NOT part of the command.

		mi impute chained /// (pmm, knn(5)) alturacm imc glucosaalazar presionsistolica  /// continuous (none in this dataset)
						  (logit,augment noisily) urban /// binary
						  (mlogit,augment noisily) indigenous idiomamaya  /// categorical unordered
						  (ologit,augment noisily) poverty_quantile imc_clase2 /// categorical ordered
			= i.diabetes i.departamento_encoded i.gravida c.alturacm c.SBPfirstvalue c.weightfirstvalue c.ageforstandardization, add(50) force
				
			// 'i' specifies categorical, 'c' specifies continuous variable
			// add(50) specifies the number of imputations to add (50), not the number of iterations
	
// Calculate diabetes prevalences by characteristic using imputed data

// // // Age

proportion diabetes,  over(edad_clase) coeflegend // this gets the legend
mi estimate: proportion diabetes, over(edad_clase) base

// Compare every older age group with the youngest group, < 20 (code 1):
// (2) 20-29, (3) 30-39, (4) 40-49, (5) 50-59, (6) 60-69, (7) 70-79, (8) 80-89
// For each group: first the prevalence difference, then the prevalence ratio
forvalues grp = 2/8 {
	mi estimate (_b[1.diabetes@`grp'bn.edad_clase] - _b[1.diabetes@1.edad_clase]): proportion diabetes, over(edad_clase)
	mi estimate (_b[1.diabetes@`grp'bn.edad_clase] / _b[1.diabetes@1.edad_clase]): proportion diabetes, over(edad_clase)
}

// // // BMI class

proportion diabetes,  over(imc_clase2) coeflegend // this gets the legend
mi estimate: proportion diabetes, stdize(age_group) stdweight(who_population) over(imc_clase2) base

// Compare each BMI class with normal weight (code 2), age-standardized:
// underweight (1, written "1bn" as in the legend), overweight (3), obesity (4)
// For each class: first the prevalence difference, then the prevalence ratio
foreach cls in 1bn 3 4 {
	mi estimate (_b[1.diabetes@`cls'.imc_clase2] - _b[1.diabetes@2.imc_clase2]): proportion diabetes, stdize(age_group) stdweight(who_population) over(imc_clase2)
	mi estimate (_b[1.diabetes@`cls'.imc_clase2] / _b[1.diabetes@2.imc_clase2]): proportion diabetes, stdize(age_group) stdweight(who_population) over(imc_clase2)
}

// // // Ethnicity, Language, Setting

// The same four steps are run for each 0/1 characteristic, in this order:
//   indigenous : indigenous (1) versus non-indigenous (0)
//   idiomamaya : Mayan language (1) versus Spanish (0)
//   urban      : urban (1) versus rural (0)
// Steps: coefficient legend, age-standardized prevalence by group,
// prevalence difference (1 minus 0), prevalence ratio (1 over 0)
foreach grp in indigenous idiomamaya urban {
	proportion diabetes, over(`grp') coeflegend
	mi estimate: proportion diabetes, stdize(age_group) stdweight(who_population) over(`grp') base

	mi estimate (_b[1.diabetes@1.`grp'] - _b[1.diabetes@0bn.`grp']): proportion diabetes, stdize(age_group) stdweight(who_population) over(`grp') base
	mi estimate (_b[1.diabetes@1.`grp'] / _b[1.diabetes@0bn.`grp']): proportion diabetes, stdize(age_group) stdweight(who_population) over(`grp') base
}

// // // Poverty

proportion diabetes, over(poverty_quantile) coeflegend
mi estimate: proportion diabetes, stdize(age_group) stdweight(who_population) over(poverty_quantile) base

// Quartiles 2, 3 and 4 each compared with quartile 1 (code 1), age-standardized
// For each quartile: first the prevalence difference, then the prevalence ratio
forvalues q = 2/4 {
	mi estimate (_b[1.diabetes@`q'.poverty_quantile] - _b[1.diabetes@1bn.poverty_quantile]): proportion diabetes, stdize(age_group) stdweight(who_population) over(poverty_quantile) base
	mi estimate (_b[1.diabetes@`q'.poverty_quantile] / _b[1.diabetes@1bn.poverty_quantile]): proportion diabetes, stdize(age_group) stdweight(who_population) over(poverty_quantile) base
}

// Overall diabetes prevalence
proportion diabetes
mi estimate: proportion diabetes
mi estimate: proportion diabetes, stdize(age_group) stdweight(who_population) // Age adjusted


// // //

// Redo age with first category as 18-29, last category as 70+

// Start from the existing age classes, then fold the youngest class into code 2
// and everyone older than 79 into code 7
gen edad_clase2 = edad_clase
replace edad_clase2 = 2 if ageforstandardization < 20
// Missing ages must be excluded explicitly: Stata treats missing as larger than any number,
// so "> 79" alone would put records with an unknown age into the 70+ class
replace edad_clase2 = 7 if ageforstandardization > 79 & !missing(ageforstandardization)

// Now...
// 2 - 18-29
// 3 - 30-39
// 4 - 40-49
// 5 - 50-59
// 6 - 60-69
// 7 - 70+

label define edad_clase2label 2 "18-29" 3 "30-39" 4 "40-49" 5 "50-59" 6 "60-69" 7 "70+"
label values edad_clase2 edad_clase2label

// // // Redo Age with edad_clase2

proportion diabetes,  over(edad_clase2) coeflegend // this gets the legend
mi estimate: proportion diabetes, over(edad_clase2) base

// Compare every older age group with 18-29 (code 2):
// (3) 30-39, (4) 40-49, (5) 50-59, (6) 60-69, (7) 70+
// For each group: first the prevalence difference, then the prevalence ratio
forvalues grp = 3/7 {
	mi estimate (_b[1.diabetes@`grp'bn.edad_clase2] - _b[1.diabetes@2.edad_clase2]): proportion diabetes, over(edad_clase2)
	mi estimate (_b[1.diabetes@`grp'bn.edad_clase2] / _b[1.diabetes@2.edad_clase2]): proportion diabetes, over(edad_clase2)
}


// Reversing the order of the quantiles without re-running the imputation
// It gets tricky, because 1 will still code for wealthiest, but we will label it "4 - Wealthiest" so as not to have to modify all 50 imputed versions of the variable

label define poverty_quantile_label 1 "4 - Wealthiest" 2 "3 - " 3 "2 - " 4 "1 - Poorest"
label values poverty_quantile poverty_quantile_label

// Poverty comparisons, using code 4 (labelled "1 - Poorest") as the reference group

// Show the coefficient names, then the age-standardized proportions by poverty quantile
proportion diabetes, over(poverty_quantile) coeflegend
mi estimate: proportion diabetes, stdize(age_group) stdweight(who_population) over(poverty_quantile) base

// Compare each remaining quantile with the reference, in this order:
//   code 3 ("2 - "), code 2 ("3 - "), code 1 ("4 - Wealthiest")
// For each quantile the difference in proportions is estimated first, then the ratio.
foreach quant of numlist 3 2 1 {
    mi estimate (_b[1.diabetes@`quant'.poverty_quantile] - _b[1.diabetes@4bn.poverty_quantile]): proportion diabetes, stdize(age_group) stdweight(who_population) over(poverty_quantile) base
    mi estimate (_b[1.diabetes@`quant'.poverty_quantile] / _b[1.diabetes@4bn.poverty_quantile]): proportion diabetes, stdize(age_group) stdweight(who_population) over(poverty_quantile) base
}

// Save the dataset in memory as the step-9 file.
// "replace" lets the do-file be re-run when the file already exists; without it
// save stops with a "file already exists" error.
save "dataset step9.dta", replace

// Additional stats for paper
//


// Overall diabetes prevalence, three ways:
//   1. plain proportion (no mi prefix)
//   2. combined across imputations with mi estimate
//   3. as (2), additionally age-standardized by age_group with who_population weights
proportion diabetes
mi estimate: proportion diabetes
mi estimate: proportion diabetes, stdweight(who_population) stdize(age_group)

// Breakdown of observations by which first-glucose values are available.
// "count" is used instead of "browse" so that all three numbers are written to the
// results window/log and the section does not depend on the interactive Data Editor
// (with back-to-back "browse" commands in a do-file nothing is recorded).
// To inspect the rows themselves, run "browse if <same condition>" interactively.

// Fasting value only
count if !missing(Fastingglucosefirstvalue) & missing(Randomglucosefirstvalue)
// Random value only
count if !missing(Randomglucosefirstvalue) & missing(Fastingglucosefirstvalue)
// Both fasting and random values
count if !missing(Randomglucosefirstvalue) & !missing(Fastingglucosefirstvalue)

// Percent in poverty: reported as the mean of problnpv2_2019
// (presumably a per-person poverty probability - confirm against the PPI merge)
tabstat problnpv2_2019, stat(mean)

// Number with diabetes: one-way frequency table of diabetes
tabulate diabetes

// Newly diagnosed versus already known

// reported_or_meds is 1 when historia_diabetes or any of the listed medication/remedy
// variables equals 1, and 0 otherwise (a missing value in a variable never equals 1).
gen reported_or_meds = ( ///
    historia_diabetes == 1 | metformina == 1 | sulf == 1 | insulina == 1 | ///
    remedio_natural_dm == 1 | medicamento_dm_otro == 1 | medicamento_dm_desconocido == 1 )

// newly_diagnosed is 1 when diabetes == 1 and reported_or_meds == 0, and 0 otherwise.
gen newly_diagnosed = ( diabetes == 1 & reported_or_meds == 0 )

tab diabetes

// Shares of each indicator among observations with diabetes == 1
proportion reported_or_meds if diabetes==1
proportion newly_diagnosed if diabetes==1

// Determine how many women have a fasting glucose, and how many a random glucose, as the
// last glucose available (divide by the total to obtain the percentages).
// "count" is used instead of "browse" so the numbers are written to the results window/log
// and the section does not depend on the interactive Data Editor. To inspect the rows, run
// "browse Fastingglucosefirstdate Fastingglucosefirstvalue Randomglucosefirstdate Randomglucosefirstvalue if <same condition>".
// When both dates are equal the observation is counted under fasting (>= below, > for random).
// NOTE(review): the variables are named "...first..." while these comments say "last glucose" - confirm which is meant.
count if ( Fastingglucosefirstdate>=Randomglucosefirstdate | missing(Randomglucosefirstvalue) ) & !missing(Fastingglucosefirstvalue) // 2,171 (Fasting glucose is last glucose available)
count if ( Randomglucosefirstdate>Fastingglucosefirstdate | missing(Fastingglucosefirstvalue) ) & !missing(Randomglucosefirstvalue) // 11,471 (Random glucose is last glucose available)